# 导入模拟HTTP请求的包
import requests
# 导入用于分析的包
from bs4 import BeautifulSoup
import lxml
# 引入正则的模块
import re
# 导入os
import os
# 引入urllib的request
from urllib import request

# 引入pymysql
import pymysql
# 引入 time
import time
import threading

# Listing-page base URL; the 1-based page number is appended as the final segment.
BASE_URL = "http://www.ilync.cn/org/6818_d_0_0_-1_-1_0_"
# Course detail URL template; {} is filled with the course id.
COURSE_DETAIL_URL = "http://www.ilync.cn/kecheng/detail_{}?f=org_coursecenter"
# MySQL connection settings, read by the (currently disabled) save_to_mysql below.
# NOTE(review): credentials are hardcoded in source — move to env vars or a config file.
DB_CONN = {
    'HOST': '192.168.182.5',
    'USER': 'root',
    'PASSWORD': '1234.Com',
    'NAME': 'DB07',
}


def get_content(url: str) -> str:
    """Perform an HTTP GET on *url* and return the response body as text.

    Sends a browser-like User-Agent so the site does not reject the
    scraper, and decodes the raw response bytes as UTF-8.

    :param url: absolute URL to fetch
    :return: response body decoded as UTF-8
    :raises requests.RequestException: on connection failure or timeout
    """
    # Spoof a desktop Chrome UA; the default python-requests UA is often blocked.
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36"
    }
    # FIX: the original call had no timeout, so a stalled server would hang
    # the scraper (and any worker thread running it) forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Decode explicitly as UTF-8 rather than trusting the declared charset.
    return response.content.decode("utf-8")

"""
def save_image(url: str):
 
    # 根据URL获取图片名称
    file_name = url.split("/")[-1]
    # 准备path
    path = os.path.join('images', file_name)
    # 直接存储
    request.urlretrieve(url, path)
"""

def get_pages(one_page_number: int) -> int:
    """Return the number of listing pages, given the courses shown per page.

    Fetches listing page 1, reads the site's total course count from the
    hidden ``countCourseNum`` input, and derives the page count.

    :param one_page_number: number of courses displayed on one listing page
    :return: total number of listing pages (0 when the site reports 0 courses)
    """
    # Any page carries the counter; page 1 is always present.
    url = BASE_URL + "1"
    content = get_content(url)
    soup = BeautifulSoup(content, 'lxml')
    # The site exposes the total course count in a hidden <input id="countCourseNum">.
    total_courses = int(soup.find('input', id='countCourseNum').attrs['value'])
    # FIX: ceiling division. The original ``int(total / n) + 1`` over-counted
    # by one page whenever the total divided evenly (e.g. 48 courses at 24
    # per page gave 3 pages instead of 2).
    return (total_courses + one_page_number - 1) // one_page_number


def get_pages_url(pages: int):
    """Build the listing-page URL for every page from 1 through *pages*.

    :param pages: total number of listing pages
    :return: list of page URLs in ascending page order
    """
    # Page numbers on this site are 1-based and appended to the base URL.
    return [BASE_URL + str(page_no) for page_no in range(1, pages + 1)]


def get_course_detail_info(url: str):
    """Scrape a course detail page for its usage counters.

    :param url: course detail page URL
    :return: dict with keys 'study_times', 'evaluate_times' and
        'collect_times', each an int scraped from the page
    """
    # Fetch and parse the detail page.
    soup = BeautifulSoup(get_content(url), 'lxml')
    # The three counters are the first three <span class="fcz"> elements,
    # in the fixed order: study count, review count, favourite count.
    counters = soup.find_all('span', class_="fcz")
    keys = ('study_times', 'evaluate_times', 'collect_times')
    # Pull the first run of digits out of each span's markup.
    return {
        key: int(re.findall(r"\d{1,}", str(counters[i]))[0])
        for i, key in enumerate(keys)
    }


def get_course_info(url: str):
    """Scrape one listing page and return the courses found on it.

    Each course is a dict with keys: ``id``, ``url`` (detail page),
    ``img``, ``title``, ``type``, ``time`` (lesson count text), and
    ``price`` — price is only present when the card shows either the
    text "免费" (free) or a decimal amount.

    :param url: listing-page URL to scrape
    :return: list of per-course dicts (one per course card on the page)
    """
    content = get_content(url)
    soup = BeautifulSoup(content, 'lxml')
    # The course cards live inside the first 'course-list-wrap' container.
    course_list = soup.find_all('div', class_='course-list-wrap')[0]
    cells = course_list.find_all('div', class_="grid-cell")
    courses_infos = []
    for cell in cells:
        info = {}
        # The course id is embedded in the card link: .../detail_<id>?...
        href = cell.find('a', class_="course-pic").attrs['href']
        course_id = href[href.find("_") + 1:href.find("?")]
        info['id'] = course_id
        info['url'] = COURSE_DETAIL_URL.format(course_id)
        # Thumbnail src and title both come from the card's <img>.
        img_tag = cell.find('img')
        info['img'] = img_tag.attrs['src']
        info['title'] = img_tag.attrs['title']
        # Course category text.  (Renamed locals: the original shadowed the
        # builtins `id`/`type` and the imported `time` module.)
        info['type'] = cell.find('div', class_="course-courseware-cate").text
        # The price block mixes whitespace and several candidate lines;
        # keep the one that is either "free" or looks like a decimal price.
        price_parts = cell.find('div', class_="course-price").text.replace("\t", "").split('\n')
        for part in price_parts:
            if "免费" in part or "." in part:
                info['price'] = part
        # Lesson-count text.
        info['time'] = cell.find('div', class_='course-courseware-num').find('span').text
        courses_infos.append(info)

    # Keep the progress print for the threaded caller, but FIX the missing
    # return: the original fell off the end and returned None (despite the
    # "# 返回" comment), so callers could never aggregate the results.
    print(courses_infos)
    return courses_infos


def get_all_course_line(urls: list):
    """Scrape every listing page sequentially and aggregate the courses.

    :param urls: listing-page URLs to scrape
    :return: flat list of course dicts collected from all pages
    """
    all_course = []
    for url in urls:
        page_courses = get_course_info(url)
        # FIX: the original created ``all_course`` but never appended to it.
        # Guard against None so this also works if get_course_info only
        # prints and returns nothing.
        if page_courses:
            all_course.extend(page_courses)
    return all_course


def get_all_course_async(urls: list):
    """Scrape listing pages concurrently, one thread per page.

    Consumes *urls* in place (pops every entry) and blocks until all
    worker threads have finished.

    :param urls: listing-page URLs to scrape; emptied by this call
    """
    threads = []
    # ~12 pages total, so one thread per page is tolerable here; a larger
    # crawl should use a bounded pool (concurrent.futures.ThreadPoolExecutor).
    while urls:
        # pop() removes and returns the last URL in the list.
        url = urls.pop()
        th = threading.Thread(target=get_course_info, args=[url])
        th.start()
        threads.append(th)
    # FIX: the original never joined its workers, so the caller (and at
    # interpreter shutdown, the process) could race ahead of the scrape.
    for th in threads:
        th.join()


"""
def save_to_mysql(courses: list):
    # 实例化一个mysql连接
    mysql_conn = pymysql.connect(DB_CONN['HOST'], DB_CONN['USER'],DB_CONN['PASSWORD'], DB_CONN['NAME'])
    # 获取操作指针
    cursor = mysql_conn.cursor()
    # 使用它循环
    try:
        # 开始循环
        for course in courses:
            # 准备SQL语句
            sql = "Insert into Course(id, cname, url,image,type,times,price,study_times,evaluation_times,collect_times) " + \
                  "value ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (course['id'],course['title'], course['url'],
                    course['img'],course['type'],course['time'],course['price'],course['study_times'],
                                                course['evaluate_times'], course['collect_times'])
            # 执行
            cursor.execute(sql)
            # 提交到数据库
            mysql_conn.commit()

    except Exception as e:
        # 回退
        mysql_conn.rollback()
        # 提示
        print("写入到数据库出现异常，具体原因：" + str(e))
    finally:
        # 关闭连接
        mysql_conn.close()
"""

if __name__ == '__main__':

    # Determine how many listing pages exist, assuming 24 courses per page.
    pages = get_pages(24)
    # Build the URL for every listing page.
    urls = get_pages_url(pages)
    # Scrape every page.  Single-threaded alternative:
    # get_all_course_line(urls)
    # Multi-threaded variant (one thread per page).
    get_all_course_async(urls)


